In this notebook we'll create a number of basic charts from our data, including a histogram, a box plot, and a scatter plot.
In [ ]:
# To show matplotlib plots inline in the IPython Notebook, use an IPython magic function
%matplotlib inline
# Import everything we need
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
In [ ]:
# Import the dataset from the CSV file.
# NOTE(review): this is an absolute path on the original author's machine;
# point it at your own copy of Accidents7904.csv before running.
accidents_data_file = '/Users/robert.dempsey/Dropbox/Private/Art of Skill Hacking/' \
                      'Books/Python Business Intelligence Cookbook/Data/Stats19-Data1979-2004/Accidents7904.csv'

# Read the first 1,000,000 rows, parsing the 'Date' column as day-first dates.
# The tupleize_cols, error_bad_lines and warn_bad_lines keywords are no longer
# passed: they have been removed from pandas.read_csv (passing them raises a
# TypeError on current releases) and were only ever set to their default
# values here, so malformed rows still raise an error as before.
accidents = pd.read_csv(accidents_data_file,
                        sep=',',
                        header=0,
                        index_col=False,
                        parse_dates=['Date'],
                        dayfirst=True,
                        skip_blank_lines=True,
                        low_memory=False,
                        nrows=1000000
                        )

# Preview the first few rows as the cell output
accidents.head()
Create a histogram of the number of casualties
In [ ]:
# Total the casualties recorded on each date (one row per distinct 'Date').
# The aggregation is named by the string 'sum' rather than np.sum: recent
# pandas releases emit a FutureWarning when a NumPy reducer is passed to
# groupby().agg(), and the string form gives the same per-date totals.
casualty_count = accidents.groupby('Date').agg({'Number_of_Casualties': 'sum'})
In [ ]:
# Histogram of the per-date casualty totals, split into 30 bins
casualties_per_date = casualty_count['Number_of_Casualties']
fig, ax = plt.subplots()
ax.hist(casualties_per_date, bins=30)
ax.set_title('Number of Casualties Histogram')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()
In [ ]:
# Show the normalised distribution of the per-date casualty totals.
# `density=True` replaces the `normed=True` keyword, which Matplotlib has
# removed from hist(); it scales the bars so that their total area is 1.
plt.hist(casualty_count['Number_of_Casualties'],
         bins=30,
         density=True)
plt.title('Probability Distribution')
plt.xlabel('Value')
plt.ylabel('Probability')
plt.show()
In [ ]:
# Show the probability of finding a value in a bin or any lower bin.
# `density=True` replaces the removed `normed=True` keyword; combined with
# `cumulative=True` each bar is the running total, so the last bar reaches 1.
plt.hist(casualty_count['Number_of_Casualties'],
         bins=20,
         density=True,
         cumulative=True)
plt.title('Cumulative Distribution')
plt.xlabel('Value')
# The y-axis runs from 0 to 1 here, so it is a cumulative probability,
# not a raw frequency count.
plt.ylabel('Cumulative Probability')
plt.show()
In [ ]:
# Same casualty histogram drawn as an unfilled outline ('step') with 20 bins
casualties_per_date = casualty_count['Number_of_Casualties']
fig, ax = plt.subplots()
ax.hist(casualties_per_date, bins=20, histtype='step')
ax.set_title('Number of Casualties Histogram')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
plt.show()
In [ ]:
# Total the vehicles involved on each date (one row per distinct 'Date').
# The aggregation is named by the string 'sum' rather than np.sum: recent
# pandas releases emit a FutureWarning when a NumPy reducer is passed to
# groupby().agg(), and the string form gives the same per-date totals.
vehicle_count = accidents.groupby('Date').agg({'Number_of_Vehicles': 'sum'})
In [ ]:
# Overlay the normalised distributions of the two per-date totals.
# `density=True` replaces the `normed=True` keyword, which Matplotlib has
# removed from hist(); normalising lets the two series share one y-axis.
plt.hist(casualty_count['Number_of_Casualties'],
         bins=20,
         histtype='stepfilled',
         density=True,
         color='b',
         label='Casualties')
# The vehicle histogram is semi-transparent so the casualty bars stay visible
plt.hist(vehicle_count['Number_of_Vehicles'],
         bins=20,
         histtype='stepfilled',
         density=True,
         color='r',
         alpha=0.5,
         label='Vehicles')
plt.title("Casualties/Vehicles Histogram")
plt.xlabel("Value")
plt.ylabel("Probability")
plt.legend()
plt.show()
In [ ]:
# Gather the two per-date total series, in the order they will be drawn
casualty_series = casualty_count['Number_of_Casualties']
vehicle_series = vehicle_count['Number_of_Vehicles']
data_to_plot = [casualty_series, vehicle_series]
In [ ]:
# Build figure number 1 (9x6 inches) holding a single set of axes
fig = plt.figure(num=1, figsize=(9, 6))
ax = fig.add_subplot(1, 1, 1)

# Draw one box per series in data_to_plot
bp = ax.boxplot(data_to_plot)

# Styling for each group of box-plot artists, applied in this order:
# caps, then medians, then fliers (the outlier markers)
artist_styles = {
    'caps': dict(color='#7570b3', linewidth=2),
    'medians': dict(color='#b2df8a', linewidth=2),
    'fliers': dict(marker='o', color='#e7298a', alpha=0.5),
}
for group_name, properties in artist_styles.items():
    for artist in bp[group_name]:
        artist.set(**properties)

# Label the two boxes on the x-axis
ax.set_xticklabels(['Casualties', 'Vehicles'])

# Save the figure to fig1.png, trimming surrounding whitespace
fig.savefig('fig1.png', bbox_inches='tight')
In [ ]:
# New figure with a single set of axes
fig, ax = plt.subplots()

# One bar per date, positioned 0..N-1 in index order, with the bar height
# equal to that date's casualty total
bar_positions = range(len(casualty_count.index))
ax.bar(bar_positions, casualty_count['Number_of_Casualties'])

# Write the chart to fig2.png
fig.savefig('fig2.png')
In [ ]: